import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
import os.path as op
import logging
import json
import random
import shutil
import numpy as np
from tqdm import tqdm
from pathlib import Path
from argparse import ArgumentParser
from typing import List, Tuple
from collections import OrderedDict
from image_synthesis.data.utils.tsv_file import TSVFile, tsv_writer, load_list_file
from image_synthesis.data.utils.tsv_utils import parallel_map, try_delete, concat_files, append_files, write_to_file, ensure_directory
from image_synthesis.data.tsv_dataset import TSVImageTextDataset
import csv


# ---------------------------------------------------------------------------
# Script configuration. Everything in the first group is handed straight to
# the TSVImageTextDataset constructor further down in this script.
# ---------------------------------------------------------------------------
name = "conceptualcaption/train"
data_root = "/mnt/blob/code/dalle/data"

# Input shards: image shards and text shards are listed in the same order.
image_tsv_file = ["gcc-train-image-%s.tsv" % shard for shard in ("00", "01")]
text_tsv_file = ["gcc-train-text-%s.tsv" % shard for shard in ("00", "01")]

# Format tag for the text column, forwarded as `text_format`.
text_format = "json"

# File forwarded as `indices_list_file`; presumably one selected row index
# per line -- TODO confirm against TSVImageTextDataset.
indices_list_file = (
    "/mnt/blob/code/dalle/data/"
    "filtered_conceptual_caption_train_index_min28000_max28000.txt"
)

# Output files for the filtered subset. These are relative paths, so they are
# created in the current working directory.
output_image_tsv_file = "gcc-train-image-28000.tsv"
output_text_tsv_file = "gcc-train-text-28000.tsv"

# tsv_file_path = "/mnt/blob/code/dalle/data/image_folders/val_min28000_max28000.tsv"

# output_image_folder = "/mnt/blob/code/dalle/data/image_folders/val_min28000_max28000"
# os.makedirs(output_image_folder, exist_ok=True)

# Build the dataset from the configuration above; the index list file is
# passed along so the dataset can restrict itself to the selected rows.
tsv = TSVImageTextDataset(
    name=name,
    data_root=data_root,
    image_tsv_file=image_tsv_file,
    text_tsv_file=text_tsv_file,
    text_format=text_format,
    indices_list_file=indices_list_file,
)
# Report how many samples the dataset exposes (this is the number of rows the
# export loop below will write).
print(f"length of this dataset is {len(tsv)}")



# Export the selected rows into two new TSV files: one holding the image rows
# and one holding the matching (first column, text) rows.
#
# Both files are opened with newline='' as the csv module requires for files
# it writes to: the writer emits its own "\r\n" row terminator, and without
# newline='' that terminator is translated a second time on platforms whose
# native line ending is not "\n".
with open(output_image_tsv_file, 'w', newline='') as image_f, \
        open(output_text_tsv_file, 'w', newline='') as text_f:
    tsv_i = csv.writer(image_f, delimiter='\t')
    # Text fields are written without any quoting; characters that would
    # otherwise need quoting are prefixed with the "|" escape character.
    tsv_t = csv.writer(text_f, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None, escapechar="|")
    for index in range(len(tsv)):
        # Presumably maps the dataset position to the row number inside the
        # underlying TSV files -- TODO confirm against TSVImageTextDataset.
        real_index = tsv._indices[index]
        tsv_i.writerow(tsv.image_tsv_file[real_index])
        # Fetch the text row once instead of once per column.
        text_row = tsv.text_tsv_file[real_index]
        tsv_t.writerow([text_row[0], str(text_row[1])])
        # Lightweight progress indicator: one line every 1000 rows.
        if index % 1000 == 0:
            print(index)


print("finished")
print("over")


